### setwd("/Users/seantrott/Dropbox/UCSD/Research/Ambiguity/SSD/spanish_norms/src/analysis/")
### Read in all data
df_final = read_csv("../../data/processed/human/sawc_relatedness_full_critical_data.csv")
nrow(df_final)
## [1] 10639
length(unique(df_final$Participant))
## [1] 131
df_ratio = df_final %>%
group_by(Same_sense) %>%
mutate(count_condition = n()) %>%
ungroup() %>%
group_by(Same_sense, Response, count_condition) %>%
summarise(count_response = n()) %>%
mutate(prop_response = count_response / count_condition)
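As a quick sanity check (a minimal sketch, assuming `df_ratio` as computed above), the conditional proportions should sum to 1 within each condition:
df_ratio %>%
group_by(Same_sense) %>%
summarise(total = sum(prop_response)) # both rows should equal 1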
df_ratio %>%
ggplot(aes(x = Response,
y = prop_response)) +
geom_bar(alpha = .6, stat = "identity") +
theme_minimal() +
labs(x = "Relatedness",
y = "P(Response | Condition)") +
facet_wrap(~Same_sense) +
theme(text = element_text(size = 15),
legend.position="none")
df_final %>%
ggplot(aes(x = Response)) +
geom_bar(alpha = .6, stat = "count") +
theme_minimal() +
labs(x = "Relatedness",
y = "Count") +
facet_wrap(~Same_sense) +
theme(text = element_text(size = 15),
legend.position="none")
df_final %>%
group_by(Same_sense) %>%
summarise(m_rel = mean(Response),
sd_rel = sd(Response))
## # A tibble: 2 × 3
## Same_sense m_rel sd_rel
## <chr> <dbl> <dbl>
## 1 Different Sense 2.11 1.41
## 2 Same Sense 4.35 1.14
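Note that the raw difference in condition means (4.35 − 2.11 = 2.24) lines up closely with the Same_sense fixed effect estimated below (2.247).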
mod_full = lmer(data = df_final,
Response ~ Same_sense +
(1 + Same_sense | Participant) +
(1 + Same_sense | List) + (1 | Word),
REML = FALSE)
mod_reduced = lmer(data = df_final,
Response ~ # Same_sense +
(1 + Same_sense | Participant) +
(1 + Same_sense | List) + (1 | Word),
REML = FALSE)
summary(mod_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Response ~ Same_sense + (1 + Same_sense | Participant) + (1 +
## Same_sense | List) + (1 | Word)
## Data: df_final
##
## AIC BIC logLik deviance df.resid
## 34198.6 34271.3 -17089.3 34178.6 10629
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8340 -0.6554 -0.1521 0.6411 3.2534
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Participant (Intercept) 0.13240 0.3639
## Same_senseSame Sense 0.13065 0.3614 -0.59
## Word (Intercept) 0.32042 0.5661
## List (Intercept) 0.02791 0.1671
## Same_senseSame Sense 0.07990 0.2827 -0.55
## Residual 1.35993 1.1662
## Number of obs: 10639, groups: Participant, 131; Word, 102; List, 10
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.11454 0.08478 29.45271 24.94 < 2e-16 ***
## Same_senseSame Sense 2.24722 0.09838 9.67540 22.84 9.65e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## Sm_snsSmSns -0.406
anova(mod_full, mod_reduced)
## Data: df_final
## Models:
## mod_reduced: Response ~ (1 + Same_sense | Participant) + (1 + Same_sense | List) + (1 | Word)
## mod_full: Response ~ Same_sense + (1 + Same_sense | Participant) + (1 + Same_sense | List) + (1 | Word)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## mod_reduced 9 34236 34302 -17109 34218
## mod_full 10 34199 34271 -17089 34179 39.589 1 3.135e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
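The likelihood-ratio statistic here is simply the difference in model deviances (34218 − 34179 ≈ 39.6), compared against a chi-squared distribution with 10 − 9 = 1 degree of freedom.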
### BETO distances
df_beto_distances = read_csv("../../data/processed/models/sawc-distances_model-bert-base-spanish-wwm-cased.csv")
nrow(df_beto_distances)
## [1] 10556
### basic items to get Tag
df_sawc_items = read_csv("../../data/raw/items/sawc_sentence_pairs.csv")
### merge together
df_merged = df_beto_distances %>%
left_join(df_sawc_items)
nrow(df_merged)
## [1] 10556
df_list_mean = df_final %>%
group_by(List, Word, Tag) %>%
summarise(mean_relatedness = mean(Response), .groups = "drop",
count = n())
nrow(df_list_mean)
## [1] 812
df_merged_beto = df_merged %>%
inner_join(df_list_mean)
nrow(df_merged_beto)
## [1] 10556
df_by_layer = df_merged_beto %>%
group_by(Layer) %>%
summarise(r = cor(mean_relatedness, Distance, method = "pearson"),
r2 = r ** 2,
rho = cor(mean_relatedness, Distance, method = "spearman"),
count = n())
summary(df_by_layer$rho)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.58780 -0.57770 -0.56818 -0.48504 -0.47098 0.02152
summary(df_by_layer$r2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000542 0.220868 0.305393 0.254792 0.321244 0.332196
summary(df_by_layer$r)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.57636 -0.56678 -0.55262 -0.47774 -0.46997 0.02328
df_by_layer %>%
ggplot(aes(x = Layer,
y = r)) +
geom_line(size = 2,
color = "cornflower blue") +
theme_minimal() +
labs(x = "Layer (BETO)",
y = "Pearson's r") +
scale_x_continuous(breaks = c(0:13)) +
theme(text = element_text(size = 15),
legend.position="none")
df_by_layer %>%
ggplot(aes(x = Layer,
y = rho)) +
geom_line(size = 2,
color = "cornflower blue") +
theme_minimal() +
labs(x = "Layer (BETO)",
y = "Spearman's rho") +
scale_x_continuous(breaks = c(0:13)) +
theme(text = element_text(size = 15),
legend.position="none")
df_by_layer %>%
ggplot(aes(x = Layer,
y = r2)) +
geom_line(size = 2,
color = "cornflower blue") +
theme_minimal() +
labs(x = "Layer (BETO)",
y = "R2") +
scale_x_continuous(breaks = c(0:13)) +
theme(text = element_text(size = 15),
legend.position="none")
min(df_by_layer$rho)
## [1] -0.5878024
df_wide <- df_merged_beto %>%
pivot_wider(
names_from = Layer, # This specifies where to get the names of the new columns
values_from = Distance # This specifies what values to fill the new columns with
) %>%
# Columns "0".."13" become "Layer1".."Layer14"; the +1 offset is undone below when recovering Layer indices
rename_with(.cols = matches("^[0-9]+$"),
.fn = ~ paste0("Layer", as.integer(.) + 1))
base_formula <- "mean_relatedness ~ "
# Create a vector to hold all layer names that you want to include in the models
layer_names <- colnames(df_wide)[grepl("Layer", colnames(df_wide))]
# Generate the model formulas incrementally
formulas <- sapply(seq_along(layer_names), function(i) {
paste(base_formula, paste(layer_names[1:i], collapse = " + "))
})
# Initialize a vector to store R-squared values
r_squared <- numeric(length(formulas))
# Loop over the formulas
for (i in seq_along(formulas)) {
model <- lm(formulas[i], data = df_wide)
r_squared[i] <- summary(model)$r.squared
}
df_results_expected = data.frame(r_squared, layer_names) %>%
mutate(r2 = r_squared) %>%
mutate(r2_delta = c(NA, diff(r2))) %>%
mutate(Layer = as.numeric(gsub("Layer", "", layer_names)) - 1) %>%
mutate(weighted_layer = Layer * r2_delta)
expected_layer = sum(df_results_expected$weighted_layer, na.rm = TRUE) / sum(df_results_expected$r2_delta, na.rm = TRUE)
expected_layer
## [1] 2.980695
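In other words, the expected layer is the center of mass of the layers, weighted by each layer's incremental contribution to R2:

$$\text{Expected layer} = \frac{\sum_{\ell} \ell \cdot \Delta R^2_{\ell}}{\sum_{\ell} \Delta R^2_{\ell}}$$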
Now, we select the best-performing layer from BETO (layer 7, where R2 peaks in the layer-wise analysis above).
df_beto_l7 = df_merged %>%
filter(Layer == 7) %>%
select(-Same_sense)
nrow(df_beto_l7)
## [1] 812
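Rather than hard-coding layer 7, the peak layer can also be read off the layer-wise results computed above (a minimal sketch, assuming df_by_layer still holds the BETO-only correlations):
best_layer <- df_by_layer$Layer[which.max(df_by_layer$r2)]
best_layer # 7, matching the filter above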
df_experimental_with_beto = df_final %>%
left_join(df_beto_l7)
nrow(df_experimental_with_beto)
## [1] 10639
mod_full = lmer(data = df_experimental_with_beto,
Response ~ Same_sense + Distance +
(1 + Same_sense + Distance | Participant) +
(1 | List) + (1 | Word),
REML = FALSE)
mod_reduced = lmer(data = df_experimental_with_beto,
Response ~ Distance + # Same_sense +
(1 + Same_sense + Distance | Participant) +
(1 | List) + (1 | Word),
REML = FALSE)
mod_just_same = lmer(data = df_experimental_with_beto,
Response ~ Same_sense + # Distance
(1 + Same_sense + Distance | Participant) +
(1 | List) + (1 | Word),
REML = FALSE)
summary(mod_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
## method [lmerModLmerTest]
## Formula: Response ~ Same_sense + Distance + (1 + Same_sense + Distance |
## Participant) + (1 | List) + (1 | Word)
## Data: df_experimental_with_beto
##
## AIC BIC logLik deviance df.resid
## 33795.9 33883.2 -16886.0 33771.9 10627
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0633 -0.6341 -0.1164 0.6223 3.5636
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## Participant (Intercept) 0.23178 0.4814
## Same_senseSame Sense 0.19301 0.4393 -0.51
## Distance 0.87600 0.9359 -0.69 -0.15
## Word (Intercept) 0.28323 0.5322
## List (Intercept) 0.01825 0.1351
## Residual 1.30359 1.1417
## Number of obs: 10639, groups: Participant, 131; Word, 102; List, 10
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 2.91931 0.09022 74.77499 32.36 <2e-16 ***
## Same_senseSame Sense 1.87369 0.04923 144.30838 38.06 <2e-16 ***
## Distance -4.26971 0.22414 306.55782 -19.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) Sm_sSS
## Sm_snsSmSns -0.389
## Distance -0.523 0.300
anova(mod_full, mod_reduced)
## Data: df_experimental_with_beto
## Models:
## mod_reduced: Response ~ Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## mod_full: Response ~ Same_sense + Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## mod_reduced 11 34125 34205 -17052 34103
## mod_full 12 33796 33883 -16886 33772 331.55 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod_full, mod_just_same)
## Data: df_experimental_with_beto
## Models:
## mod_just_same: Response ~ Same_sense + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## mod_full: Response ~ Same_sense + Distance + (1 + Same_sense + Distance | Participant) + (1 | List) + (1 | Word)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## mod_just_same 11 34002 34082 -16990 33980
## mod_full 12 33796 33883 -16886 33772 208.27 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### Visualize
df_experimental_with_beto %>%
mutate(distance_binned = ntile(Distance, 20)) %>%
group_by(Same_sense, distance_binned) %>%
summarize(
mean_relatedness = mean(Response),
sd_relatedness = sd(Response),
count = n(),
se_relatedness = sd_relatedness / sqrt(count)
) %>%
ggplot(aes(x = distance_binned,
y = mean_relatedness,
color = Same_sense,
fill = Same_sense)) +
geom_line(size = 1.5) +
geom_ribbon(aes(ymin = mean_relatedness - se_relatedness,
ymax = mean_relatedness + se_relatedness),
alpha = 0.8,
color = NA) +
labs(x = "BETO Cosine Distance (Binned)",
y = "Relatedness",
color = "Same Sense",
fill = "Same Sense") +
theme_minimal() +
scale_fill_manual(values = my_colors) +
scale_color_manual(values = my_colors) +
theme(text = element_text(size = 15),
legend.position="bottom")
### First, get group means
df_list_mean = df_experimental_with_beto %>%
group_by(List, Word, Tag, Same_sense) %>%
summarise(mean_relatedness = mean(Response), .groups = "drop",
count = n(),
distance = mean(Distance))
nrow(df_list_mean)
## [1] 812
### Get BETO cor
BETO_COR = abs(cor(df_list_mean$mean_relatedness,
df_list_mean$distance, method = "spearman"))
### Now, iterate through ppts
ppts = unique(df_final$Participant)
df_r = data.frame()
for (ppt in ppts) {
# Subset df_critical for the current participant
individual_data <- df_final %>%
filter(Participant == ppt) %>%
select(List, Word, Tag, Response) # keep only the columns needed for the join
# Merge individual data with mean data
merged_data <- inner_join(individual_data, df_list_mean, by = c("List", "Word", "Tag"))
test = cor.test(merged_data$Response,
merged_data$mean_relatedness,
method = "spearman")
df_test = broom::tidy(test)
df_test$ppt = ppt
df_test$List = unique(individual_data$List)
df_r = rbind(df_r, df_test)
}
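Note that merged_data correlates each participant with group means computed over all participants, including that participant. A strict leave-one-out variant (a sketch of an alternative, not what is run above) would recompute the means inside the loop, excluding the held-out participant:
# Sketch (hypothetical alternative): exclude the held-out participant before averaging
loo_means <- df_final %>%
filter(Participant != ppt) %>%
group_by(List, Word, Tag) %>%
summarise(mean_relatedness = mean(Response), .groups = "drop")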
### Visualization
df_r %>%
ggplot(aes(x = estimate)) +
geom_histogram(alpha = .6) +
scale_x_continuous(limits = c(0, 1)) +
theme_minimal() +
geom_vline(xintercept = BETO_COR, size = 1.5,
linetype = "dashed", alpha = .7) +
labs(x = "Leave-one-out Correlation") +
theme(text = element_text(size = 15),
legend.position="none")
summary(df_r$estimate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3946 0.7480 0.7962 0.7725 0.8308 0.8834
### What proportion of inter-annotator agreement scores are larger?
prop_larger = df_r %>%
mutate(larger = estimate >= BETO_COR) %>%
summarise(mean(larger))
prop_larger
## # A tibble: 1 × 1
## `mean(larger)`
## <dbl>
## 1 0.947
### Full model
summary(lm(data = df_list_mean,
mean_relatedness ~ Same_sense + distance))
##
## Call:
## lm(formula = mean_relatedness ~ Same_sense + distance, data = df_list_mean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.43999 -0.52874 -0.05629 0.42901 2.47200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.90576 0.07835 37.09 <2e-16 ***
## Same_senseSame Sense 1.86552 0.06724 27.74 <2e-16 ***
## distance -4.08053 0.36825 -11.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7658 on 809 degrees of freedom
## Multiple R-squared: 0.6578, Adjusted R-squared: 0.6569
## F-statistic: 777.5 on 2 and 809 DF, p-value: < 2.2e-16
### Just same sense
summary(lm(data = df_list_mean,
mean_relatedness ~ Same_sense))
##
## Call:
## lm(formula = mean_relatedness ~ Same_sense, data = df_list_mean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2490 -0.6487 -0.1114 0.4836 2.6364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.11357 0.03437 61.49 <2e-16 ***
## Same_senseSame Sense 2.22638 0.06310 35.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8214 on 810 degrees of freedom
## Multiple R-squared: 0.6058, Adjusted R-squared: 0.6054
## F-statistic: 1245 on 1 and 810 DF, p-value: < 2.2e-16
### Just cosine distance
summary(lm(data = df_list_mean,
mean_relatedness ~ distance))
##
## Call:
## lm(formula = mean_relatedness ~ distance, data = df_list_mean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.93439 -0.81273 -0.03441 0.87319 2.71132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.29020 0.08432 50.88 <2e-16 ***
## distance -9.02865 0.44979 -20.07 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.069 on 810 degrees of freedom
## Multiple R-squared: 0.3322, Adjusted R-squared: 0.3314
## F-statistic: 402.9 on 1 and 810 DF, p-value: < 2.2e-16
# Calculate mean and SE of Distance
df_summary <- df_beto_distances %>%
group_by(Layer, Same_sense) %>%
summarise(
mean_Distance = mean(Distance),
sd_Distance = sd(Distance),
count = n(),
se_Distance = sd_Distance / sqrt(count),
.groups = 'drop' # Drop the automatic grouping by dplyr
)
df_summary %>%
ggplot(aes(x = Layer,
y = mean_Distance,
color = Same_sense,
fill = Same_sense)) +
geom_line(size = 1.5) +
geom_ribbon(aes(ymin = mean_Distance - 2 * se_Distance,
ymax = mean_Distance + 2 * se_Distance),
alpha = 0.8,
color = NA) +
labs(x = "Layer (BETO)",
y = "Mean Distance",
color = "Same Sense",
fill = "Same Sense") +
scale_x_continuous(breaks = c(0:13)) +
theme_minimal() +
# scale_fill_viridis(option = "mako", discrete=TRUE) +
# scale_color_viridis(option = "mako", discrete=TRUE) +
scale_fill_manual(values = my_colors) +
scale_color_manual(values = my_colors) +
theme(text = element_text(size = 15),
legend.position="bottom")
We also fit a logistic regression model for each layer, predicting Same_sense from cosine distance, both one layer at a time and cumulatively.
df_wide <- df_beto_distances %>%
pivot_wider(
names_from = Layer, # This specifies where to get the names of the new columns
values_from = Distance # This specifies what values to fill the new columns with
) %>%
# As above, columns "0".."13" become "Layer1".."Layer14" (+1 offset, undone when recovering Layer indices)
rename_with(.cols = matches("^[0-9]+$"),
.fn = ~ paste0("Layer", as.integer(.) + 1))
base_formula <- "Same_sense ~ "
# Create a vector to hold all layer names that you want to include in the models
layer_names <- colnames(df_wide)[grepl("Layer", colnames(df_wide))]
# Generate the model formulas: one per individual layer
formulas_individual <- sapply(seq_along(layer_names), function(i) {
paste(base_formula, paste(layer_names[i], collapse = " + "))
})
# Generate the model formulas incrementally: all layers up to and including layer i
formulas_combined <- sapply(seq_along(layer_names), function(i) {
paste(base_formula, paste(layer_names[1:i], collapse = " + "))
})
aic_individual <- numeric(length(formulas_individual))
aic_together <- numeric(length(formulas_combined))
# Loop over the formulas
for (i in seq_along(formulas_combined)) {
model <- glm(formulas_combined[i], data = df_wide, family = binomial())
aic_together[i] <- summary(model)$aic
}
# Loop over the formulas
for (i in seq_along(formulas_individual)) {
model <- glm(formulas_individual[i], data = df_wide, family = binomial())
aic_individual[i] <- summary(model)$aic
}
# Compute the AIC improvement from each added layer and the AIC-weighted expected layer
df_results_expected = data.frame(aic_individual, aic_together, layer_names) %>%
mutate(aic_delta = -c(NA, diff(aic_together))) %>%
mutate(Layer = as.numeric(gsub("Layer", "", layer_names)) - 1) %>%
mutate(weighted_layer = Layer * aic_delta)
expected_layer = sum(df_results_expected$weighted_layer, na.rm = TRUE) / sum(df_results_expected$aic_delta, na.rm = TRUE)
expected_layer
## [1] 3.208745
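This mirrors the R2-weighted expected layer above, with each layer's AIC improvement (the drop in AIC when that layer is added) serving as the weight.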
df_results_expected %>%
filter(aic_individual == min(df_results_expected$aic_individual))
## aic_individual aic_together layer_names aic_delta Layer weighted_layer
## 1 729.0844 720.9658 Layer6 4.348899 5 21.7445
df_item_means = df_final %>%
group_by(List, Word, Same_sense, Sentence_1, Sentence_2,
Sense_id_s1, Sense_id_s2, Gender_s1, Gender_s2) %>%
summarise(mean_relatedness = mean(Response),
sd_relatedness = sd(Response),
median_relatedness = median(Response),
count = n())
df_item_means %>%
ggplot(aes(x = mean_relatedness)) +
geom_histogram(alpha = .6, bins = 10) +
theme_minimal() +
labs(x = "Mean Relatedness",
y = "Count") +
facet_wrap(~Same_sense) +
theme(text = element_text(size = 15),
legend.position="none")
df_item_means %>%
ggplot(aes(x = mean_relatedness,
y = Same_sense,
fill = Same_sense)) +
geom_density_ridges2(aes(height = ..density..),
color=gray(0.25),
alpha = .7,
scale=.85,
size = 0,
stat="density") +
labs(x = "Mean Relatedness",
y = "",
fill = "") +
theme_minimal()+
scale_fill_manual(values = my_colors) +
theme(text = element_text(size = 15),
legend.position="none")
df_beto = read_csv("../../data/processed/models/sawc-distances_model-bert-base-spanish-wwm-cased.csv") %>%
mutate(Model = "BETO-cased",
Multilingual = "Monolingual")
df_xlm = read_csv("../../data/processed/models/sawc-distances_model-xlm-roberta-base.csv") %>%
mutate(Model = "XLM-RoBERTa",
Multilingual = "Multilingual")
df_mb = read_csv("../../data/processed/models/sawc-distances_model-bert-base-multilingual-cased.csv") %>%
mutate(Model = "Multilingual BERT",
Multilingual = "Multilingual")
df_db = read_csv("../../data/processed/models/sawc-distances_model-distilbert-base-spanish-uncased.csv") %>%
mutate(Model = "DistilBETO",
Multilingual = "Monolingual")
df_ab = read_csv("../../data/processed/models/sawc-distances_model-albert-base-spanish.csv") %>%
mutate(Model = "ALBERT-base",
Multilingual = "Monolingual")
df_at = read_csv("../../data/processed/models/sawc-distances_model-albert-tiny-spanish.csv") %>%
mutate(Model = "ALBERT-tiny",
Multilingual = "Monolingual")
df_axl = read_csv("../../data/processed/models/sawc-distances_model-albert-xlarge-spanish.csv") %>%
mutate(Model = "ALBERT-xlarge",
Multilingual = "Monolingual")
df_al = read_csv("../../data/processed/models/sawc-distances_model-albert-large-spanish.csv") %>%
mutate(Model = "ALBERT-large",
Multilingual = "Monolingual")
df_axxl = read_csv("../../data/processed/models/sawc-distances_model-albert-xxlarge-spanish.csv") %>%
mutate(Model = "ALBERT-xxlarge",
Multilingual = "Monolingual")
df_rb = read_csv("../../data/processed/models/sawc-distances_model-roberta-base-bne.csv") %>%
mutate(Model = "RoBERTa-base",
Multilingual = "Monolingual")
df_rl = read_csv("../../data/processed/models/sawc-distances_model-roberta-large-bne.csv") %>%
mutate(Model = "RoBERTa-large",
Multilingual = "Monolingual")
df_beto_uncased = read_csv("../../data/processed/models/sawc-distances_model-bert-base-spanish-wwm-uncased.csv") %>%
mutate(Model = "BETO-uncased",
Multilingual = "Monolingual")
df_all = df_beto %>%
bind_rows(df_xlm) %>%
bind_rows(df_db) %>%
bind_rows(df_ab) %>%
bind_rows(df_at) %>%
bind_rows(df_mb) %>%
bind_rows(df_axl) %>%
bind_rows(df_al) %>%
bind_rows(df_axxl) %>%
bind_rows(df_rb) %>%
bind_rows(df_rl) %>%
bind_rows(df_beto_uncased)
nrow(df_all)
## [1] 144536
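For reference, this stacking could be written more compactly (a sketch; model_files is an assumed named vector mapping model labels to CSV paths, and the Multilingual column would still need to be added per model):
model_files <- c("BETO-cased" = "../../data/processed/models/sawc-distances_model-bert-base-spanish-wwm-cased.csv")
# ... one entry per model
df_all_alt <- purrr::imap_dfr(model_files, ~ read_csv(.x) %>% mutate(Model = .y))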
### basic items to get Tag
df_sawc_items = read_csv("../../data/raw/items/sawc_sentence_pairs.csv")
### merge together
df_merged = df_all %>%
left_join(df_sawc_items)
df_list_mean = df_final %>%
group_by(List, Word, Tag) %>%
summarise(mean_relatedness = mean(Response), .groups = "drop",
count = n())
nrow(df_list_mean)
## [1] 812
df_merged = df_merged %>%
inner_join(df_list_mean)
nrow(df_merged)
## [1] 144536
df_by_layer = df_merged %>%
group_by(Model, Multilingual, Layer, n_params) %>%
summarise(r = cor(mean_relatedness, Distance, method = "pearson"),
r2 = r ** 2,
rho = cor(mean_relatedness, Distance, method = "spearman"),
count = n())
max(df_by_layer$r2)
## [1] 0.3321961
df_by_layer %>%
filter(r2 == max(df_by_layer$r2))
## # A tibble: 1 × 8
## # Groups: Model, Multilingual, Layer [1]
## Model Multilingual Layer n_params r r2 rho count
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 BETO-cased Monolingual 7 109850880 -0.576 0.332 -0.587 812
df_by_layer %>%
filter(Model != "ALBERT-tiny") %>%
filter(Model != "ALBERT-xlarge") %>%
filter(Model != "DistilBETO") %>%
filter(Model != "ALBERT-large") %>%
filter(Model != "RoBERTa-large") %>%
ggplot(aes(x = Layer,
y = r2,
color = Model,
linetype = Model)) +
geom_line(size = 2) +
theme_minimal() +
labs(x = "Layer",
y = "R2",
color = "",
linetype = "") +
scale_x_continuous(breaks = c(0:13)) +
scale_color_viridis(option = "mako", discrete=TRUE) +
theme(text = element_text(size = 12),
legend.position="bottom")
df_by_layer %>%
filter(Model != "ALBERT-tiny") %>%
filter(Model != "ALBERT-xlarge") %>%
filter(Model != "DistilBETO") %>%
filter(Model != "ALBERT-large") %>%
filter(Model != "RoBERTa-large") %>%
ggplot(aes(x = Layer,
y = r2,
color = Model)) +
geom_line(size = 2) +
theme_minimal() +
labs(x = "Layer",
y = "R2",
color = "") +
scale_x_continuous(breaks = c(0:13)) +
scale_color_viridis(option = "mako", discrete=TRUE) +
theme(text = element_text(size = 12),
legend.position="bottom")
df_by_layer %>%
filter(Model %in% c("ALBERT-xlarge",
"ALBERT-large",
"RoBERTa-large")) %>%
ggplot(aes(x = Layer,
y = r2,
color = Model)) +
geom_line(size = 2) +
theme_minimal() +
labs(x = "Layer",
y = "R2",
color = "") +
scale_color_viridis(option = "mako", discrete=TRUE) +
theme(text = element_text(size = 15),
legend.position="bottom")
We can also break this down by layer depth ratio, i.e., each layer's index divided by the model's total number of layers (so layer 6 of a 12-layer model and layer 12 of a 24-layer model both map to 0.5).
df_families = data.frame(
Model = unique(df_by_layer$Model),
Family = c("ALBERT", "ALBERT", "ALBERT", "ALBERT",
"ALBERT", "BERT", "BERT", "BERT", "BERT",
"RoBERTa", "RoBERTa", "RoBERTa")
)
df_by_layer = df_by_layer %>%
group_by(Model) %>%
mutate(max_layer = max(Layer),
prop_layer = Layer / max_layer) %>%
inner_join(df_families)
df_by_layer %>%
ggplot(aes(x = prop_layer,
y = r2)) +
geom_line(size = .5, aes(color = Model, group = Model),
alpha = .3) +
stat_summary(
fun = mean, # calculate mean
geom = "line", # use line geometry
size = 2, # set line size
na.rm = TRUE # remove NA values
) +
theme_minimal() +
labs(x = "Proportion of Total Layers",
y = "R2",
color = "") + # Renaming legend title appropriately
scale_color_viridis(option = "mako", discrete = TRUE) +
theme(text = element_text(size = 15),
legend.position = "none")
df_by_layer %>%
ggplot(aes(x = prop_layer,
y = r2,
color = Family)) +
geom_smooth(size = 2) +
scale_color_viridis(option = "mako", discrete = TRUE) +
theme_minimal() +
labs(x = "Proportion of Total Layers",
y = "R2",
color = "Family") +
theme(text = element_text(size = 15),
legend.position = "bottom")
df_by_layer %>%
mutate(binned_prop_layer = ntile(prop_layer, 10)) %>%
mutate(prop_binned = binned_prop_layer / 10) %>%
ggplot(aes(x = prop_binned,
y = r2)) +
stat_summary(
aes(group = Family,
color = Family),
fun = mean,
geom = "line",
size = 2
) +
stat_summary(
aes(group = Family,
fill = Family),
fun.data = mean_se,
geom = "ribbon",
alpha = 0.2,
color = NA
) +
theme_minimal() +
labs(x = "Layer Depth Ratio",
y = "R2",
color = "Family") +
scale_color_viridis(option = "mako", discrete = TRUE) +
theme(text = element_text(size = 15),
legend.position = "bottom")
The BETO/ALBERT/DistilBETO family of models provides a nice within-family comparison of the effect of parameter count, since these models are trained on the same data.
df_max_r2 = df_by_layer %>%
group_by(Model, n_params, Multilingual) %>%
summarise(max_r2 = max(r2))
df_max_r2 %>%
ggplot(aes(x = n_params,
y = max_r2,
color = Multilingual,
shape = Multilingual)) +
geom_point(size = 6,
alpha = .7) +
geom_hline(yintercept = mean(df_r$estimate)**2, ### Human accuracy
linetype = "dotted", color = "red",
size = 1.2, alpha = .5) +
geom_text_repel(aes(label=Model), size=3) +
scale_x_log10() +
scale_y_continuous(limits = c(0, 1)) +
labs(x = "Parameters",
y = "Maximum R2",
color = "",
shape = "") +
theme_minimal() +
# guides(color="none") +
# scale_color_viridis(option = "mako", discrete=TRUE) +
scale_color_manual(values = my_colors) +
theme(text = element_text(size = 15),
legend.position="bottom")
We also compare residuals for Same vs. Different Sense items.
# Fit models and calculate residuals for each LLM
models <- by(df_merged, df_merged$Model, function(subdata) {
model <- lm(mean_relatedness ~ Distance * Layer, data = subdata)
subdata$residuals <- residuals(model)
return(subdata)
})
# Combine the results back into a single dataframe
results <- do.call(rbind, models)
# Modify Same_sense
results = results %>%
mutate(Same_sense = case_when(
Same_sense == TRUE ~ "Same Sense",
Same_sense == FALSE ~ "Different Sense"
))
results %>%
ggplot(aes(x = residuals,
fill = Same_sense)) +
geom_density(alpha = .7, size = 0) +
geom_vline(xintercept = 0, size = .6, linetype = "dashed") +
theme_minimal() +
# scale_fill_viridis(option = "mako", discrete=TRUE) +
theme(text = element_text(size = 15),
legend.position="bottom") +
scale_fill_manual(values = my_colors) +
labs(x = "Residuals (Relatedness ~ Distance * Layer)",
fill = "") +
facet_wrap(~Model)
TODO: In progress; build a correlation matrix that incorporates the demographic breakdowns and also compares them to Chilean BERT.
### Load Chilean BERT
df_chilean_bert = read_csv("../../data/processed/models/sawc-distances_model-patana-chilean-spanish-bert.csv") %>%
mutate(Model = "Chilean-BERT", Multilingual = "Monolingual")
### Merge with model
df_all = df_all %>%
bind_rows(df_chilean_bert)
nrow(df_all)
## [1] 155092
### Load relatedness by demographic
df_demographic_averages = read_csv("../../data/processed/human/sawc_relatedness_by_demographic.csv") %>%
select(-Same_sense)
### Merge together
df_llm_plus_demo = df_demographic_averages %>%
inner_join(df_all) %>%
select(Sentence_1, Sentence_2, Word,
mean_relatedness_Chile, mean_relatedness_México, mean_relatedness_España,
Model, Distance, Layer) %>%
filter(Layer == 4) ### Placeholder layer for testing; TODO: select each model's best layer
df_llm_plus_demo
## # A tibble: 10,556 × 9
## Sentence_1 Sente…¹ Word mean_…² mean_…³ mean_…⁴ Model Dista…⁵ Layer
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Compró el aceite d… Compró… acei… 4.5 3.18 NA BETO… 0.0729 4
## 2 Compró el aceite d… Compró… acei… 4.5 3.18 NA XLM-… 0.0110 4
## 3 Compró el aceite d… Compró… acei… 4.5 3.18 NA Dist… 0.0369 4
## 4 Compró el aceite d… Compró… acei… 4.5 3.18 NA ALBE… 0.0211 4
## 5 Compró el aceite d… Compró… acei… 4.5 3.18 NA ALBE… 0.0277 4
## 6 Compró el aceite d… Compró… acei… 4.5 3.18 NA Mult… 0.0209 4
## 7 Compró el aceite d… Compró… acei… 4.5 3.18 NA ALBE… 0.00694 4
## 8 Compró el aceite d… Compró… acei… 4.5 3.18 NA ALBE… 0.0201 4
## 9 Compró el aceite d… Compró… acei… 4.5 3.18 NA ALBE… 0.0344 4
## 10 Compró el aceite d… Compró… acei… 4.5 3.18 NA RoBE… 0.0392 4
## # … with 10,546 more rows, and abbreviated variable names ¹Sentence_2,
## # ²mean_relatedness_Chile, ³mean_relatedness_México,
## # ⁴mean_relatedness_España, ⁵Distance
## # ℹ Use `print(n = ...)` to see more rows
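As a sketch of that TODO (names assumed from this section), each model's best layer could be chosen by its correlation with one of the demographic means, rather than hard-coding Layer == 4:
df_best_layers <- df_demographic_averages %>%
inner_join(df_all) %>%
group_by(Model, Layer) %>%
summarise(r = cor(mean_relatedness_Chile, Distance, use = "complete.obs"),
.groups = "drop_last") %>%
slice_max(abs(r), n = 1) # one row per model: its best layer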
### Pivot wide
s = df_llm_plus_demo %>%
pivot_wider(names_from = Model,
values_from = Distance)
### Extract relevant columns
relevant_columns = s %>%
mutate(Chile = mean_relatedness_Chile,
México = mean_relatedness_México,
España = mean_relatedness_España) %>%
select(Chile, México, España,
`BETO-cased`, `XLM-RoBERTa`,
DistilBETO, `ALBERT-base`, `ALBERT-tiny`,
`Multilingual BERT`, `ALBERT-xlarge`, `ALBERT-large`,
`ALBERT-xxlarge`, `RoBERTa-base`, `RoBERTa-large`, `BETO-uncased`,
`Chilean-BERT`
)
### absolute value
relevant_cors = abs(cor(relevant_columns, use = "complete.obs"))
relevant_cors
## Chile México España BETO-cased XLM-RoBERTa
## Chile 1.0000000 0.8508143 0.8238417 0.4994681 0.2236498
## México 0.8508143 1.0000000 0.8747620 0.5482607 0.2776124
## España 0.8238417 0.8747620 1.0000000 0.5512521 0.2524410
## BETO-cased 0.4994681 0.5482607 0.5512521 1.0000000 0.4109759
## XLM-RoBERTa 0.2236498 0.2776124 0.2524410 0.4109759 1.0000000
## DistilBETO 0.3745862 0.4047428 0.3953257 0.6350121 0.3220583
## ALBERT-base 0.3593834 0.3871968 0.3666261 0.5892588 0.4230497
## ALBERT-tiny 0.3063879 0.3365394 0.2986687 0.4209135 0.3284829
## Multilingual BERT 0.3110919 0.3362316 0.3319716 0.6122717 0.4232802
## ALBERT-xlarge 0.3404443 0.3609759 0.3480030 0.6055391 0.3848763
## ALBERT-large 0.2542477 0.2860133 0.2554225 0.5353650 0.3843066
## ALBERT-xxlarge 0.2807636 0.3015163 0.2961813 0.5704506 0.3846120
## RoBERTa-base 0.4147787 0.4424037 0.4519473 0.7277146 0.3940679
## RoBERTa-large 0.3482683 0.3698964 0.3737819 0.6685210 0.4107094
## BETO-uncased 0.3049111 0.3410787 0.3617072 0.6036837 0.3451527
## Chilean-BERT 0.4314856 0.4718126 0.4595117 0.8521254 0.4434771
## DistilBETO ALBERT-base ALBERT-tiny Multilingual BERT
## Chile 0.3745862 0.3593834 0.3063879 0.3110919
## México 0.4047428 0.3871968 0.3365394 0.3362316
## España 0.3953257 0.3666261 0.2986687 0.3319716
## BETO-cased 0.6350121 0.5892588 0.4209135 0.6122717
## XLM-RoBERTa 0.3220583 0.4230497 0.3284829 0.4232802
## DistilBETO 1.0000000 0.5504276 0.5686367 0.5004061
## ALBERT-base 0.5504276 1.0000000 0.5175161 0.6514023
## ALBERT-tiny 0.5686367 0.5175161 1.0000000 0.3949596
## Multilingual BERT 0.5004061 0.6514023 0.3949596 1.0000000
## ALBERT-xlarge 0.6089505 0.7189638 0.6704144 0.5878105
## ALBERT-large 0.5962305 0.7837651 0.6559533 0.6192542
## ALBERT-xxlarge 0.5839987 0.5835854 0.6802126 0.5352473
## RoBERTa-base 0.6180596 0.5649007 0.4709227 0.5408775
## RoBERTa-large 0.6102204 0.4749476 0.4949663 0.5322389
## BETO-uncased 0.8248177 0.4594634 0.4341077 0.4176955
## Chilean-BERT 0.6471521 0.5941260 0.4867991 0.5682728
## ALBERT-xlarge ALBERT-large ALBERT-xxlarge RoBERTa-base
## Chile 0.3404443 0.2542477 0.2807636 0.4147787
## México 0.3609759 0.2860133 0.3015163 0.4424037
## España 0.3480030 0.2554225 0.2961813 0.4519473
## BETO-cased 0.6055391 0.5353650 0.5704506 0.7277146
## XLM-RoBERTa 0.3848763 0.3843066 0.3846120 0.3940679
## DistilBETO 0.6089505 0.5962305 0.5839987 0.6180596
## ALBERT-base 0.7189638 0.7837651 0.5835854 0.5649007
## ALBERT-tiny 0.6704144 0.6559533 0.6802126 0.4709227
## Multilingual BERT 0.5878105 0.6192542 0.5352473 0.5408775
## ALBERT-xlarge 1.0000000 0.7917851 0.7461274 0.5477237
## ALBERT-large 0.7917851 1.0000000 0.7118564 0.5302841
## ALBERT-xxlarge 0.7461274 0.7118564 1.0000000 0.5306079
## RoBERTa-base 0.5477237 0.5302841 0.5306079 1.0000000
## RoBERTa-large 0.5774232 0.5414029 0.5628845 0.7685964
## BETO-uncased 0.4750966 0.4947804 0.5216844 0.5498036
## Chilean-BERT 0.5725059 0.5746295 0.5734939 0.7260469
## RoBERTa-large BETO-uncased Chilean-BERT
## Chile 0.3482683 0.3049111 0.4314856
## México 0.3698964 0.3410787 0.4718126
## España 0.3737819 0.3617072 0.4595117
## BETO-cased 0.6685210 0.6036837 0.8521254
## XLM-RoBERTa 0.4107094 0.3451527 0.4434771
## DistilBETO 0.6102204 0.8248177 0.6471521
## ALBERT-base 0.4749476 0.4594634 0.5941260
## ALBERT-tiny 0.4949663 0.4341077 0.4867991
## Multilingual BERT 0.5322389 0.4176955 0.5682728
## ALBERT-xlarge 0.5774232 0.4750966 0.5725059
## ALBERT-large 0.5414029 0.4947804 0.5746295
## ALBERT-xxlarge 0.5628845 0.5216844 0.5734939
## RoBERTa-base 0.7685964 0.5498036 0.7260469
## RoBERTa-large 1.0000000 0.5811935 0.6643691
## BETO-uncased 0.5811935 1.0000000 0.5858514
## Chilean-BERT 0.6643691 0.5858514 1.0000000
corrplot(relevant_cors,
method = "shade",
# type = "upper",
tl.col = "black")
breaks <- seq(-1, 1, length.out=101)
my_palette <- colorRampPalette(c("lightblue", "white", "lightcoral"))(100)
heatmap.2(relevant_cors,
breaks=breaks,
col=my_palette,
main="Correlations",
trace="none",
dendrogram="row",
Rowv=TRUE,
Colv=FALSE,
scale="none",
key=TRUE,
keysize=1.2,
symm=TRUE, # symmetric heatmap
density.info="none",
margins=c(6,6),
cexRow=0.9, # Adjust font size of row labels if needed
cexCol=0.9, # Adjust font size of column labels if needed
# cellnote=round(relevant_cors, 2), # Add correlation values to cells
notecol="black", # Color for cell values
notecex=0.8) # Font size for cell values
df_sawc = read_csv("../../data/processed/human/sawc_avg_relatedness.csv")
df_gpt4 = read_csv("../../data/processed/models/sawc-ratings-gpt4-turbo.csv")
df_merged_gpt4 = df_gpt4 %>%
inner_join(df_sawc)
nrow(df_merged_gpt4)
## [1] 812
cor.test(df_merged_gpt4$GPT_rating, df_merged_gpt4$mean_relatedness,
method = "spearman")
##
## Spearman's rank correlation rho
##
## data: df_merged_gpt4$GPT_rating and df_merged_gpt4$mean_relatedness
## S = 17918116, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.7991942
cor.test(df_merged_gpt4$GPT_rating, df_merged_gpt4$mean_relatedness,
method = "pearson")
##
## Pearson's product-moment correlation
##
## data: df_merged_gpt4$GPT_rating and df_merged_gpt4$mean_relatedness
## t = 40.53, df = 810, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7943084 0.8398946
## sample estimates:
## cor
## 0.8183849
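Squaring this Pearson correlation (0.818² ≈ 0.670) matches the R-squared of the GPT-only regression below.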
mod_full = lm(data = df_merged_gpt4,
mean_relatedness ~ Same_sense + GPT_rating)
summary(mod_full)
##
## Call:
## lm(formula = mean_relatedness ~ Same_sense + GPT_rating, data = df_merged_gpt4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.33981 -0.45438 -0.04765 0.40039 1.92798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.11995 0.05412 20.69 <2e-16 ***
## Same_senseSame Sense 1.13595 0.07189 15.80 <2e-16 ***
## GPT_rating 0.45208 0.02120 21.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6577 on 809 degrees of freedom
## Multiple R-squared: 0.7476, Adjusted R-squared: 0.747
## F-statistic: 1198 on 2 and 809 DF, p-value: < 2.2e-16
mod_gpt = lm(data = df_merged_gpt4,
mean_relatedness ~ GPT_rating)
summary(mod_gpt)
##
## Call:
## lm(formula = mean_relatedness ~ GPT_rating, data = df_merged_gpt4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.88145 -0.43840 0.01484 0.49944 2.62128
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.76252 0.05621 13.56 <2e-16 ***
## GPT_rating 0.69045 0.01704 40.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7519 on 810 degrees of freedom
## Multiple R-squared: 0.6698, Adjusted R-squared: 0.6693
## F-statistic: 1643 on 1 and 810 DF, p-value: < 2.2e-16
mod_sense = lm(data = df_merged_gpt4,
mean_relatedness ~ Same_sense)
summary(mod_sense)
##
## Call:
## lm(formula = mean_relatedness ~ Same_sense, data = df_merged_gpt4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2490 -0.6487 -0.1114 0.4836 2.6364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.11357 0.03437 61.49 <2e-16 ***
## Same_senseSame Sense 2.22638 0.06310 35.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8214 on 810 degrees of freedom
## Multiple R-squared: 0.6058, Adjusted R-squared: 0.6054
## F-statistic: 1245 on 1 and 810 DF, p-value: < 2.2e-16
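Both predictors thus explain unique variance: adding GPT_rating to the sense-only model raises R-squared from 0.606 to 0.748, and adding Same_sense to the GPT-only model raises it from 0.670 to 0.748.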
df_merged_gpt4$residuals = residuals(mod_gpt)
df_merged_gpt4 %>%
ggplot(aes(x = residuals,
fill = Same_sense)) +
geom_density(alpha = .7, size = 0) +
geom_vline(xintercept = 0, size = .6, linetype = "dashed") +
theme_minimal() +
# scale_fill_viridis(option = "mako", discrete=TRUE) +
theme(text = element_text(size = 15),
legend.position="bottom") +
scale_fill_manual(values = my_colors) +
labs(x = "Residuals (GPT-4 Turbo Ratings)",
fill = "")